import numpy as np
from numpy.linalg import cholesky
import plotly
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode,iplot
init_notebook_mode(connected=True)
import warnings
warnings.filterwarnings('ignore')
import plotly
import plotly.offline as py
py.init_notebook_mode(connected=False)
import random
from tqdm import tqdm_notebook
import plotly.offline.offline as of
from sklearn.metrics.cluster import normalized_mutual_info_score as NMI ##评价指标
# Synthetic data set: five 2-D Gaussian blobs sharing one covariance matrix.
# Each sample row is (x_1, x_2, label) with label in {0, 1, 2, 3, 4}.
Sigma = np.array([[1, 0], [0, 1]])
# numpy's cholesky returns the LOWER factor L with Sigma == L @ L.T.
R = cholesky(Sigma)

mu1 = np.array([1, -1])
mu2 = np.array([5.5, -4.5])
mu3 = np.array([1, 4])
mu4 = np.array([6, 4.5])
mu5 = np.array([9, 0])
# One component mean per row.  The rows are deliberately NOT passed through
# np.sort: sorting along the last axis would reorder the coordinates inside
# each mean (e.g. [1, -1] -> [-1, 1]) and the rows would stop being the means.
mu = np.vstack((mu1, mu2, mu3, mu4, mu5))

# z @ L.T has covariance L @ L.T == Sigma for z ~ N(0, I); multiplying by L
# itself is only correct when Sigma is the identity.
x1, x2, x3, x4, x5 = (
    np.concatenate(
        [
            np.dot(np.random.randn(500, 2), R.T) + mean,
            np.full((500, 1), float(label)),
        ],
        axis=1,
    )
    for label, mean in enumerate(mu)
)

data = np.vstack((x1, x2, x3, x4, x5))
np.savetxt('data', data)
# Scatter plot of the raw samples, one trace per generating component.
fig = go.Figure()
for component_id, samples in enumerate((x1, x2, x3, x4, x5), start=1):
    trace = go.Scatter(
        x=samples[:, 0],
        y=samples[:, 1],
        mode='markers',
        name="$id = {}$".format(component_id),
    )
    fig.add_trace(trace)
fig.update_layout(xaxis_title='$x_1$', yaxis_title='$x_2$')
fig.show()
class K_Means(object):
    """Small k-means clusterer using alternating assignment / update rounds.

    Constructor arguments (stored with a trailing underscore):
        k: number of clusters.
        tolerance: largest Euclidean displacement of any centre between two
            consecutive rounds that is still treated as "converged".
        max_iter: upper bound on the number of rounds.

    Attributes set by ``fit``:
        centers_: dict mapping cluster key -> centre vector.
        clf_: dict mapping cluster position (0 .. k-1) -> list of the rows
            assigned to that cluster in the last round.
    """

    def __init__(self, k=2, tolerance=0.0001, max_iter=500):
        self.k_ = k
        self.tolerance_ = tolerance
        self.max_iter_ = max_iter

    def fit(self, data, c=None):
        """Cluster the rows of ``data``.

        Args:
            data: 2-D array-like, one sample per row.
            c: optional mapping of cluster key -> initial centre.  When it is
                ``None`` or empty, the first ``k`` rows of ``data`` are used
                as initial centres (shuffle ``data`` first for a random
                start).  The mapping is copied, never modified.

        Returns:
            The 0-based index of the last round that was executed, or -1 if
            ``max_iter`` allowed no round at all.

        Raises:
            ValueError: if ``c`` is given but does not hold exactly ``k``
                centres.
        """
        data = np.asarray(data, dtype=float)

        if c is None or len(c) == 0:
            self.centers_ = {i: data[i].copy() for i in range(self.k_)}
        else:
            if len(c) != self.k_:
                raise ValueError(
                    "expected {} initial centers, got {}".format(self.k_, len(c))
                )
            # Copy so the caller's mapping and arrays stay untouched.
            self.centers_ = {
                key: np.array(value, dtype=float) for key, value in c.items()
            }

        keys = list(self.centers_)
        last_round = -1
        for last_round in range(self.max_iter_):
            # Distance of every sample to every centre, shape (n_samples, k).
            stacked = np.stack([self.centers_[key] for key in keys])
            distances = np.linalg.norm(
                data[:, None, :] - stacked[None, :, :], axis=2
            )
            # argmin keeps the first minimum, i.e. ties go to the lower index.
            nearest = distances.argmin(axis=1)

            self.clf_ = {}
            largest_shift = 0.0
            for position, key in enumerate(keys):
                members = data[nearest == position]
                self.clf_[position] = list(members)
                if len(members) == 0:
                    # An empty cluster keeps its previous centre; averaging
                    # zero rows would turn the centre into NaN.
                    continue
                updated = members.mean(axis=0)
                shift = float(np.linalg.norm(updated - self.centers_[key]))
                largest_shift = max(largest_shift, shift)
                self.centers_[key] = updated

            # Absolute displacement: immune to sign cancellation and to
            # centres that have a zero coordinate (no division involved).
            if largest_shift <= self.tolerance_:
                break
        return last_round

    def predict(self, p_data):
        """Return the position (0 .. k-1) of the centre closest to ``p_data``."""
        distances = [
            np.linalg.norm(p_data - center) for center in self.centers_.values()
        ]
        return int(np.argmin(distances))
# Experiment 1: k-means started from randomly chosen samples, repeated
# `epoch` times.  Records the iteration count and the NMI of every run.
epoch = 10
NMI_random = []
NMI_center = []
iters_random = []
iters_center = []

for _ in range(epoch):
    model = K_Means(k=5, max_iter=1000)  # the true number of components (5) is known
    # In-place row shuffle, so the first k rows used as seeds are random.
    np.random.shuffle(data)
    # Cluster on the two coordinates only: the last column is the
    # ground-truth label and must not leak into the features.
    features = data[:, :2]
    iters = model.fit(features)
    result = np.array([model.predict(d) for d in features])
    iters_random.append(iters)
    NMI_random.append(NMI(result, data[:, -1]))

# NOTE(review): the original indentation was lost; plotting is assumed to run
# once, for the last run, rather than inside the loop -- confirm.
xs = [data[np.where(result == i)] for i in range(5)]
fig = go.Figure()
for i in range(5):
    trace = go.Scatter(
        x=xs[i][:, 0],
        y=xs[i][:, 1],
        mode='markers',
        name="$id = {}$".format(i + 1),
    )
    fig.add_trace(trace)
fig.update_layout(
    xaxis_title='$x_1$',
    yaxis_title='$x_2$',
)
fig.show()

# Bare expressions kept from the notebook: they display values there and do
# nothing when the file runs as a script.
iters_random
NMI_random
data[:, :2].shape
# Experiment 2: k-means seeded with the true component means, repeated
# `epoch` times.  Records the iteration count and the NMI of every run.
for _ in range(epoch):
    model = K_Means(k=5, max_iter=1000)  # the true number of components (5) is known
    np.random.shuffle(data)
    seeds = {0: mu1, 1: mu2, 2: mu3, 3: mu4, 4: mu5}
    iters = model.fit(data[:, :2], seeds)
    result = np.array([model.predict(d) for d in data[:, :2]])
    iters_center.append(iters)
    NMI_center.append(NMI(result, data[:, -1]))

# NOTE(review): the original indentation was lost; plotting is assumed to run
# once, for the last run, rather than inside the loop -- confirm.
xs = [data[result == cluster] for cluster in range(5)]
fig = go.Figure()
for i, members in enumerate(xs):
    trace = go.Scatter(
        x=members[:, 0],
        y=members[:, 1],
        mode='markers',
        name="$id = {}$".format(i + 1),
    )
    fig.add_trace(trace)
fig.update_layout(xaxis_title='$x_1$', yaxis_title='$x_2$')
fig.show()

# Bare expressions kept from the notebook (display-only there).
iters_center
NMI_center
# Compare the two initialisation strategies run by run:
# first the iteration counts, then the NMI scores.
run_index = np.arange(0, epoch, 1)

fig = go.Figure()
for label, series in (("$center$", iters_center), ("$random$", iters_random)):
    trace = go.Scatter(x=run_index, y=series, name=label)
    fig.add_trace(trace)
fig.update_layout(
    # Raw string: "\_" is not a valid Python escape sequence; the value sent
    # to plotly is unchanged (backslash followed by underscore).
    yaxis_title=r'$iter\_num$',
    xaxis_title='$times$',
)
fig.show()

fig = go.Figure()
for label, series in (("$center$", NMI_center), ("$random$", NMI_random)):
    trace = go.Scatter(x=run_index, y=series, name=label)
    fig.add_trace(trace)
fig.update_layout(
    yaxis_title='$NMI$',
    xaxis_title='$times$',
)
fig.show()
# Single run with more clusters (k = 10) than true components (5).
k = 10
model = K_Means(k=k, max_iter=1000)
np.random.shuffle(data)
# Cluster on the two coordinates only; the last column is the ground-truth
# label and must not be used as a feature.
features = data[:, :2]
iters = model.fit(features)
result = np.array([model.predict(d) for d in features])
print(iters)

xs = [data[np.where(result == i)] for i in range(k)]
fig = go.Figure()
for i in range(k):
    trace = go.Scatter(
        x=xs[i][:, 0],
        y=xs[i][:, 1],
        mode='markers',
        name="$id = {}$".format(i + 1),
    )
    fig.add_trace(trace)
fig.update_layout(
    xaxis_title='$x_1$',
    yaxis_title='$x_2$',
)
fig.show()
# Single run with fewer clusters (k = 2) than true components (5).
k = 2
model = K_Means(k=k, max_iter=1000)
np.random.shuffle(data)
# Cluster on the two coordinates only; the last column is the ground-truth
# label and must not be used as a feature.
features = data[:, :2]
iters = model.fit(features)
result = np.array([model.predict(d) for d in features])
print(iters)

xs = [data[np.where(result == i)] for i in range(k)]
fig = go.Figure()
for i in range(k):
    trace = go.Scatter(
        x=xs[i][:, 0],
        y=xs[i][:, 1],
        mode='markers',
        name="$id = {}$".format(i + 1),
    )
    fig.add_trace(trace)
fig.update_layout(
    xaxis_title='$x_1$',
    yaxis_title='$x_2$',
)
fig.show()
# Sweep over the number of clusters: for every k, average the NMI and the
# iteration count over `avg` randomly initialised runs.
ks = [2, 3, 5, 6, 10, 50]
avg = 3
NMIs = []
iters_time = []
for k in ks:
    avgs_NMI = []
    avgs_iter = []
    for _ in range(avg):
        model = K_Means(k=k, max_iter=1000)
        np.random.shuffle(data)
        # Cluster on the two coordinates only; the last column is the
        # ground-truth label and must not be used as a feature.
        features = data[:, :2]
        iters = model.fit(features)
        result = np.array([model.predict(d) for d in features])
        avgs_iter.append(iters)
        avgs_NMI.append(NMI(result, data[:, -1]))
    NMIs.append(sum(avgs_NMI) / len(avgs_NMI))
    iters_time.append(sum(avgs_iter) / len(avgs_iter))

# Bare expressions kept from the notebook (display-only there).
NMIs
iters_time
# Plot the sweep results: average NMI against k, then average iteration
# count against k.
fig = go.Figure()
trace = go.Scatter(x=ks, y=NMIs, name="$NMIs$")
fig.add_trace(trace)
fig.update_layout(
    xaxis_title='$k$',
    # Raw string: "\_" is not a valid Python escape sequence; the value sent
    # to plotly is unchanged.
    yaxis_title=r'$avg\_NMI$',
)
fig.show()

fig = go.Figure()
# This trace shows iteration counts, so it is labelled accordingly (it was
# labelled "$NMIs$", copied from the figure above).
trace = go.Scatter(x=ks, y=iters_time, name="$iters$")
fig.add_trace(trace)
fig.update_layout(
    xaxis_title='$k$',
    yaxis_title=r'$avg\_iters$',
)
fig.show()